Loading Dataset¶

In [2]:
pip install lime
Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 275.7/275.7 kB 4.3 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Requirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from lime) (3.7.1)
Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from lime) (1.23.5)
Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from lime) (1.10.1)
Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from lime) (4.66.1)
Requirement already satisfied: scikit-learn>=0.18 in /usr/local/lib/python3.10/dist-packages (from lime) (1.2.2)
Requirement already satisfied: scikit-image>=0.12 in /usr/local/lib/python3.10/dist-packages (from lime) (0.19.3)
Requirement already satisfied: networkx>=2.2 in /usr/local/lib/python3.10/dist-packages (from scikit-image>=0.12->lime) (3.1)
Requirement already satisfied: pillow!=7.1.0,!=7.1.1,!=8.3.0,>=6.1.0 in /usr/local/lib/python3.10/dist-packages (from scikit-image>=0.12->lime) (9.4.0)
Requirement already satisfied: imageio>=2.4.1 in /usr/local/lib/python3.10/dist-packages (from scikit-image>=0.12->lime) (2.31.1)
Requirement already satisfied: tifffile>=2019.7.26 in /usr/local/lib/python3.10/dist-packages (from scikit-image>=0.12->lime) (2023.8.12)
Requirement already satisfied: PyWavelets>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-image>=0.12->lime) (1.4.1)
Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from scikit-image>=0.12->lime) (23.1)
Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.18->lime) (1.3.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.18->lime) (3.2.0)
Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->lime) (1.1.0)
Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->lime) (0.11.0)
Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->lime) (4.42.0)
Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->lime) (1.4.4)
Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->lime) (3.1.1)
Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib->lime) (2.8.2)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7->matplotlib->lime) (1.16.0)
Building wheels for collected packages: lime
  Building wheel for lime (setup.py) ... done
  Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283835 sha256=a0125bd8374e83ce8d08e854fe1fc921dd36d2600d4e162251655b427f37d124
  Stored in directory: /root/.cache/pip/wheels/fd/a2/af/9ac0a1a85a27f314a06b39e1f492bee1547d52549a4606ed89
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.2.0.1
In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from mpl_toolkits.mplot3d import Axes3D
import xgboost as xgb
from lime import lime_tabular
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import cross_val_predict
In [4]:
# Loading CSV file
file_path = '/content/slice_localization_data.csv'
df = pd.read_csv(file_path)

EDA¶

In [5]:
df.head()
Out[5]:
patientId value0 value1 value2 value3 value4 value5 value6 value7 value8 ... value375 value376 value377 value378 value379 value380 value381 value382 value383 reference
0 0 0.0 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 -0.25 ... -0.25 0.980381 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 21.803851
1 0 0.0 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 -0.25 ... -0.25 0.977008 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 21.745726
2 0 0.0 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 -0.25 ... -0.25 0.977008 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 21.687600
3 0 0.0 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 -0.25 ... -0.25 0.977008 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 21.629474
4 0 0.0 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 -0.25 ... -0.25 0.976833 0.0 0.0 0.0 0.0 0.0 -0.25 -0.25 21.571348

5 rows × 386 columns

In [6]:
dataset_shape = df.shape
print("Dataset shape:", dataset_shape)
Dataset shape: (53500, 386)
In [7]:
# Checking and printing of numerical and categorical columns

categorical_columns = df.select_dtypes(include='object').columns
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns

categorical_count = len(categorical_columns)
numerical_count = len(numerical_columns)

print("Categorical columns:", categorical_columns)
print("Numerical columns:", numerical_columns)
print("Number of categorical columns:", categorical_count)
print("Number of numerical columns:", numerical_count)
Categorical columns: Index([], dtype='object')
Numerical columns: Index(['patientId', 'value0', 'value1', 'value2', 'value3', 'value4', 'value5',
       'value6', 'value7', 'value8',
       ...
       'value375', 'value376', 'value377', 'value378', 'value379', 'value380',
       'value381', 'value382', 'value383', 'reference'],
      dtype='object', length=386)
Number of categorical columns: 0
Number of numerical columns: 386
In [8]:
# Check for null values
null_counts = df.isnull().sum()
print("Columns with null values:")
for column, count in null_counts.items():
    if count > 0:
        print(f"{column}: {count} null values")
Columns with null values:

From the above dataset, we can see that variables named as ‘value0’, ‘value1’,.. ‘value383’ contain feature values of CT scan images for each patient. The last variable is ‘reference’. This ‘reference’ is our target variable and it contains the relative location of the CT slice.

In [9]:
df.describe(include='all')
Out[9]:
patientId value0 value1 value2 value3 value4 value5 value6 value7 value8 ... value375 value376 value377 value378 value379 value380 value381 value382 value383 reference
count 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000 ... 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000 53500.000000
mean 47.075701 0.059627 0.071558 0.145819 0.218728 0.274762 0.276189 0.204531 0.062281 -0.042025 ... -0.029404 0.182913 0.320112 0.359373 0.342889 0.266091 0.083049 -0.031146 -0.154524 47.028039
std 27.414240 0.174243 0.196921 0.300270 0.359163 0.378862 0.369605 0.351294 0.292232 0.268391 ... 0.085817 0.383333 0.463517 0.478188 0.471811 0.437633 0.279734 0.098738 0.122491 22.347042
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 -0.250000 -0.250000 -0.250000 -0.250000 ... -0.250000 0.000000 0.000000 0.000000 0.000000 0.000000 -0.250000 -0.250000 -0.250000 1.738733
25% 23.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 -0.250000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 -0.250000 29.891607
50% 46.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 -0.250000 43.987893
75% 70.000000 0.000000 0.000000 0.000000 0.446429 0.684477 0.662382 0.441412 0.000000 0.000000 ... 0.000000 0.000000 0.996286 0.999677 0.999560 0.949478 0.000000 0.000000 0.000000 63.735059
max 96.000000 1.000000 1.000000 1.000000 1.000000 0.998790 0.996468 0.999334 1.000000 1.000000 ... 0.961279 1.000000 1.000000 1.000000 1.000000 1.000000 0.999857 0.996839 0.942851 97.489115

8 rows × 386 columns

In [10]:
# Getting unqiue values of the "Patient ID"
np.unique(df["patientId"])
Out[10]:
array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96])
In [11]:
print(df.columns)
Index(['patientId', 'value0', 'value1', 'value2', 'value3', 'value4', 'value5',
       'value6', 'value7', 'value8',
       ...
       'value375', 'value376', 'value377', 'value378', 'value379', 'value380',
       'value381', 'value382', 'value383', 'reference'],
      dtype='object', length=386)
In [12]:
# Dropping off unnecessary variable ‘patientId’, separating features and target variables.
df_copy = df.drop(['patientId'], axis=1)
df_y = df_copy['reference']
df_x = df_copy.drop(['reference'], axis=1)
In [13]:
# Plotting of target variable distribution
plt.figure(figsize=(12, 8))
sns.distplot(df_y, bins=100)
plt.title("Distribution of Target Variable (Reference)")
plt.xlabel("Reference Value")
plt.ylabel("Frequency")
plt.show()
<ipython-input-13-711025c69005>:3: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(df_y, bins=100)
In [14]:
# Plot the distribution of means for each column in the dataset (excluding the first and last columns)
plt.figure(figsize=(8, 6))
sns.distplot(df_x.mean())
plt.title("Distribution of Means for Each Attribute")
plt.xlabel("Mean Value")
plt.ylabel("Density")
plt.show()

# Explanation: This plot helps us understand the range and distribution of the means of different attributes in the dataset. It's a useful step to take before deciding whether to apply feature scaling techniques.
# ===> We can conclude all of our attributes are in [-1,1] range, so we don't need to use feature normalize technique
<ipython-input-14-aa548682aa42>:3: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(df_x.mean())
In [15]:
# Convert the DataFrame to a NumPy array for plotting
X = df_x.values

# Create the figure and axes for the polar plots
fig = plt.figure(figsize=(12, 10))

axes1 = fig.add_axes([0, 2, 1, 1], projection='polar')
axes2 = fig.add_axes([1, 2, 1, 1], projection='polar')
axes3 = fig.add_axes([0, 1, 1, 1], projection='polar')
axes4 = fig.add_axes([1, 1, 1, 1], projection='polar')

# Plotting first example of bone structure of the first person
axes1.plot(X[150, 1:241], 'bo', ms=10, label='Patient 1')
axes1.set_xlabel("Bone Structure")
axes1.legend()

# Plotting first example of air inclusion of the first person
axes2.plot(X[150, 241:386], 'ro', ms=10, label='Patient 1')
axes2.set_xlabel("Air Inclusion")
axes2.legend()

# Plotting second example of bone structure of the second person
axes3.plot(X[3541, 1:241], 'bo', ms=10, label='Patient 2')
axes3.set_xlabel("Bone Structure")
axes3.legend()

# Plotting second example of air inclusion of the second person
axes4.plot(X[3541, 241:386], 'ro', ms=10, label='Patient 2')
axes4.set_xlabel("Air Inclusion")
axes4.legend()

plt.show()

Split the Data

In [16]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.3, random_state=42)

# View the split data
print("X_train:")
print(X_train.head())
print("\nX_test:")
print(X_test.head())
print("\ny_train:")
print(y_train.head())
print("\ny_test:")
print(y_test.head())
X_train:
         value0    value1    value2    value3    value4    value5    value6  \
19113  0.000000  0.000000  0.000000  0.000000  0.939018  0.965932  0.873580   
40279  0.114286  0.020930  0.000000  0.963090  0.680756  0.558228  0.439762   
42189  0.000000  0.000000  0.000000  0.751979  0.000000  0.842081  0.904479   
30994  0.000000  0.000000  0.914286  0.855310  0.893836  0.000000  0.000000   
19373  0.000000  0.267206  0.904605  0.972478  0.856772  0.000000  0.000000   

         value7    value8  value9  ...  value374  value375  value376  \
19113  0.000000  0.000000   -0.25  ...       0.0       0.0  0.000000   
40279  0.898336  0.741252    0.00  ...       0.0       0.0  0.000000   
42189  0.943744  0.699692    0.00  ...       0.0       0.0  0.994347   
30994  0.000000  0.000000   -0.25  ...       0.0       0.0  0.000000   
19373  0.000000  0.000000   -0.25  ...       0.0       0.0  0.000000   

       value377  value378  value379  value380  value381  value382  value383  
19113  0.000000  0.000000  0.000000       0.0       0.0       0.0       0.0  
40279  0.000000  0.000000  0.000000       0.0       0.0       0.0       0.0  
42189  0.999256  0.999908  0.997149       0.0       0.0       0.0       0.0  
30994  0.000000  0.000000  0.000000       0.0       0.0       0.0       0.0  
19373  0.000000  0.000000  0.000000       0.0       0.0       0.0       0.0  

[5 rows x 384 columns]

X_test:
        value0    value1    value2    value3    value4    value5  value6  \
46266  0.00000  0.000000  0.000000  0.000000  0.000000  0.704285     0.0   
2778   0.86783  0.898305  0.169761  0.411677  0.950726  0.000000     0.0   
34408  0.00000  0.000000  0.000000  0.000000  0.000000  0.000000     0.0   
13871  0.00000  0.000000  0.000000  0.000000  0.797264  0.390216     0.0   
35994  0.00000  0.000000  0.000000  0.000000  0.000000  0.000000     0.0   

       value7  value8  value9  ...  value374  value375  value376  value377  \
46266    0.00   -0.25   -0.25  ...       0.0       0.0       0.0  0.797007   
2778    -0.25   -0.25   -0.25  ...       0.0       0.0       0.0  0.000000   
34408    0.00   -0.25   -0.25  ...       0.0       0.0       0.0  0.999131   
13871    0.00    0.00   -0.25  ...       0.0       0.0       0.0  0.000000   
35994    0.00   -0.25   -0.25  ...       0.0       0.0       0.0  0.000000   

       value378  value379  value380  value381  value382  value383  
46266  0.999385  0.999884  0.999754       0.0      0.00     -0.25  
2778   0.988959  0.985753  0.000000       0.0     -0.25     -0.25  
34408  0.999918  0.999959  0.999382       0.0      0.00     -0.25  
13871  0.000000  0.000000  0.000000       0.0      0.00     -0.25  
35994  0.000000  0.000000  0.000000       0.0      0.00     -0.25  

[5 rows x 384 columns]

y_train:
19113    91.005317
40279    26.270760
42189    33.807090
30994    79.645955
19373    81.673863
Name: reference, dtype: float64

y_test:
46266    43.349961
2778     12.119149
34408    39.070125
13871    88.928088
35994    60.022506
Name: reference, dtype: float64
In [17]:
# Plot the distribution of the target variable before splitting
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.hist(df_y, bins=30, color='blue', alpha=0.7)
plt.title("Distribution of Target Variable (Reference) - Before Split")
plt.xlabel("Reference Value")
plt.ylabel("Frequency")

# Plot the distribution of the target variable after splitting
plt.subplot(1, 2, 2)
plt.hist(y_train, bins=30, color='green', alpha=0.7, label='Train')
plt.hist(y_test, bins=30, color='orange', alpha=0.7, label='Test')
plt.title("Distribution of Target Variable (Reference) - After Split")
plt.xlabel("Reference Value")
plt.ylabel("Frequency")
plt.legend()

plt.tight_layout()
plt.show()

Apply Linear Regression¶

In [18]:
# Intializing Linear Regression Model
lm = LinearRegression()
lm.fit(X_train, y_train)
Out[18]:
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
In [19]:
# Evaluation on test set

y_pred_linear_test = lm.predict(X_test)

mse = mean_squared_error(y_test, y_pred_linear_test)
mae = mean_absolute_error(y_test, y_pred_linear_test)
r2 = r2_score(y_test, y_pred_linear_test)

print('Test Set Mean Squared Error:', mse)
print('Test Set Mean Absolute Error:', mae)
print('Test Set R-squared:', r2)
Test Set Mean Squared Error: 68.29434562989341
Test Set Mean Absolute Error: 6.123343272084307
Test Set R-squared: 0.8624473194580933
In [20]:
# Evaluation on training set

y_pred_linear_train = lm.predict(X_train)

mse = mean_squared_error(y_train, y_pred_linear_train)
mae = mean_absolute_error(y_train, y_pred_linear_train)
r2 = r2_score(y_train, y_pred_linear_train)

print('Train Set Mean Squared Error:', mse)
print('Train Set Mean Absolute Error:', mae)
print('Train Set R-squared:', r2)
Train Set Mean Squared Error: 67.82275427597651
Train Set Mean Absolute Error: 6.10251339719793
Train Set R-squared: 0.8645188670291543
In [21]:
# Define the function to plot learning curves
def plotLearningCurves(X, y, step):
    m, n = X.shape
    maxVal = (int)(m / 10) * 10
    N_size_arr = np.arange(10, maxVal + 10, step)
    error_arr = np.zeros((len(N_size_arr), 2))  # Updated line
    index = 0

    # Fitting Model
    lm.fit(X, y)

    # Increasing train dataset size, "step" times in each iteration
    for i in N_size_arr:
        # Splitting Training dataset with size i into train and cross-validation sets
        X_train_subset = X_train[:i]
        y_train_subset = y_train[:i]

        # Computing both mean squared error of the training dataset and cross-validation datasets predictions
        error_arr[index, 0] = mean_squared_error(y_train_subset, lm.predict(X_train_subset))
        error_arr[index, 1] = mean_squared_error(y_test, lm.predict(X_test))

        # Increasing index by 1
        index += 1

    # Initializing the figure
    fig = plt.figure(figsize=(12, 8))
    ax = fig.add_axes([0, 0, 1, 1])
    ax.set_yscale('log')

    # Plotting "Training set size" vs. "Mean Squared Error" for both the training and cross-validation dataset's errors
    line1, = ax.plot(N_size_arr, error_arr[:, 0], c='red')
    line2, = ax.plot(N_size_arr, error_arr[:, 1], c='blue')

    # Adding labels and legends to our plot
    ax.set_xlabel("N (Training set size)")
    ax.set_ylabel("Mean Squared Error")

    ax.legend((line1, line2), ("Train Error", "Test Error"))

# Call the function to plot the learning curves
plotLearningCurves(X_train, y_train, 200)
In [22]:
# Predecting Reference values with the test dataset
y_pred = lm.predict(X_test)

# Plotting predictions vs. y_test
fig = plt.figure(figsize=(8, 5))
ax = fig.add_axes([0, 0, 1, 1])

ax.set_xlabel("Predictions")
ax.set_ylabel("Test Target Variable")
ax.plot(y_test, y_pred, 'bo', ms=1)

# Display the plot
plt.show()

Based on the above information, it appears that the model might be slightly overfitting. Overfitting occurs when a model learns the training data too well and performs poorly on unseen data.

The mean squared error (MSE) and mean absolute error (MAE) on the training dataset are slightly lower than on the test dataset. Additionally, the R-squared value on the training dataset is higher than on the test dataset.

High complexity model: Overfitting can occur when the model is too complex relative to the available data. With a large number of features (386) compared to the number of instances (53500), it's possible that the model has learned noise or irrelevant patterns in the training data.

PCA (Principal Component Analysis) process¶

Feature Standarization

  1. perform the PCA to obtain the principal components and their corresponding explained variance
In [23]:
scaler = StandardScaler()
scaled_df_x = scaler.fit_transform(df_x)
# Create a PCA (Principal Component Analysis) instance with desired variance explained threshold
pca = PCA(0.75)
pca_vectors = pca.fit_transform(scaled_df_x)
for index, var in enumerate(pca.explained_variance_ratio_):
    print("Explained Variance ratio by Principal Component ", (index+1), " : ", var)
Explained Variance ratio by Principal Component  1  :  0.14855715257210392
Explained Variance ratio by Principal Component  2  :  0.12108312979894122
Explained Variance ratio by Principal Component  3  :  0.06474442177698231
Explained Variance ratio by Principal Component  4  :  0.03774984839961066
Explained Variance ratio by Principal Component  5  :  0.03510333274248252
Explained Variance ratio by Principal Component  6  :  0.025645225528655405
Explained Variance ratio by Principal Component  7  :  0.02330954175071417
Explained Variance ratio by Principal Component  8  :  0.021428275145390387
Explained Variance ratio by Principal Component  9  :  0.017614067903705406
Explained Variance ratio by Principal Component  10  :  0.01589391279292687
Explained Variance ratio by Principal Component  11  :  0.014032726489128131
Explained Variance ratio by Principal Component  12  :  0.012871921022909665
Explained Variance ratio by Principal Component  13  :  0.0121634919283602
Explained Variance ratio by Principal Component  14  :  0.01072092397679328
Explained Variance ratio by Principal Component  15  :  0.009760345345618412
Explained Variance ratio by Principal Component  16  :  0.009482151161428901
Explained Variance ratio by Principal Component  17  :  0.008695231697355841
Explained Variance ratio by Principal Component  18  :  0.008205878792193493
Explained Variance ratio by Principal Component  19  :  0.008041977514589788
Explained Variance ratio by Principal Component  20  :  0.007543652519555233
Explained Variance ratio by Principal Component  21  :  0.006931981984392693
Explained Variance ratio by Principal Component  22  :  0.00646707766389355
Explained Variance ratio by Principal Component  23  :  0.006083596311372858
Explained Variance ratio by Principal Component  24  :  0.005868832669382626
Explained Variance ratio by Principal Component  25  :  0.005741237892731493
Explained Variance ratio by Principal Component  26  :  0.005545363955500639
Explained Variance ratio by Principal Component  27  :  0.0053812466975905315
Explained Variance ratio by Principal Component  28  :  0.005160938036779552
Explained Variance ratio by Principal Component  29  :  0.005084092580391446
Explained Variance ratio by Principal Component  30  :  0.004799621223915218
Explained Variance ratio by Principal Component  31  :  0.004652832065005538
Explained Variance ratio by Principal Component  32  :  0.004574687218755086
Explained Variance ratio by Principal Component  33  :  0.004478718115698722
Explained Variance ratio by Principal Component  34  :  0.004312607244892595
Explained Variance ratio by Principal Component  35  :  0.004197561850896129
Explained Variance ratio by Principal Component  36  :  0.004096225788266026
Explained Variance ratio by Principal Component  37  :  0.004050004696179025
Explained Variance ratio by Principal Component  38  :  0.0040203530745072285
Explained Variance ratio by Principal Component  39  :  0.0039032597942033425
Explained Variance ratio by Principal Component  40  :  0.0037259511080387207
Explained Variance ratio by Principal Component  41  :  0.0036811022474497667
Explained Variance ratio by Principal Component  42  :  0.003532807590921863
Explained Variance ratio by Principal Component  43  :  0.0035221449980117033
Explained Variance ratio by Principal Component  44  :  0.0033683692074793284
Explained Variance ratio by Principal Component  45  :  0.003324213440610176
Explained Variance ratio by Principal Component  46  :  0.0032417350414558144
Explained Variance ratio by Principal Component  47  :  0.003097464371346726
Explained Variance ratio by Principal Component  48  :  0.0030493180022711022
Explained Variance ratio by Principal Component  49  :  0.0030174010370553972
Explained Variance ratio by Principal Component  50  :  0.002893053408362604
Explained Variance ratio by Principal Component  51  :  0.002831606959181239
Explained Variance ratio by Principal Component  52  :  0.002775687439238411

we are able to reduce dimensions from 384 to 52

  1. perform the PCA to obtain the principal components and their corresponding explained variance
In [24]:
# Calculate the explained variance ratio
explained_variance_ratio = pca.explained_variance_ratio_

# Calculate the cumulative explained variance
cumulative_variance = np.cumsum(explained_variance_ratio)

# Plot the explained variance ratio and cumulative explained variance
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(explained_variance_ratio) + 1), explained_variance_ratio, marker='o', label='Explained Variance Ratio')
plt.plot(range(1, len(explained_variance_ratio) + 1), cumulative_variance, marker='o', label='Cumulative Explained Variance')
plt.xlabel('Number of Principal Components')
plt.ylabel('Explained Variance Ratio / Cumulative Explained Variance')
plt.title('Explained Variance Ratio and Cumulative Explained Variance')
plt.legend()
plt.show()
In [25]:
pca_vectors = pca.fit_transform(scaled_df_x)

# Access and print the pca_vectors
print("PCA Vectors:")
print(pca_vectors)
PCA Vectors:
[[ 1.66418097e+01 -5.28637854e+00  5.90813596e+00 ...  1.47466546e+00
  -6.07350165e-03  7.87119195e-01]
 [ 1.65943085e+01 -4.94762736e+00  6.12864705e+00 ...  1.18524823e+00
  -1.03928883e-01  1.02606483e+00]
 [ 1.65927230e+01 -4.91301981e+00  6.17570643e+00 ...  1.15494987e+00
  -1.00923531e-01  8.95570094e-01]
 ...
 [-4.79214559e+00  1.42699675e+01  5.57219567e-01 ... -7.52523437e-01
   1.66459189e-01  7.02028650e-02]
 [ 1.73465338e+01 -3.01729764e+00  6.15848484e+00 ... -5.72477971e-01
  -1.88995484e-01  1.73028394e+00]
 [ 1.75212840e+01 -2.49155501e+00  5.31833961e+00 ... -4.57566760e-01
  -3.26204443e-01  1.49666375e+00]]
In [26]:
# Select a subset of principal components
subset_pca = pca_vectors[:, :5]  # Adjust the number of components as desired

# Create a DataFrame with the subset of principal components
df_subset_pca = pd.DataFrame(subset_pca, columns=[f'Principal Component {i+1}' for i in range(subset_pca.shape[1])])

# Create a scatter plot matrix
sns.set(style='ticks')
sns.pairplot(df_subset_pca)
plt.suptitle('Scatter Plot Matrix of Subset Principal Components')
plt.show()
In [27]:
sns.set(style="ticks")
sns.set_palette(palette='Set1')

fig_1 = plt.figure(figsize=(18,6))

sns.set(style="ticks")
sns.set_palette(palette='Set1')

sns.regplot(x=pca_vectors[:,0],y=df_y, label='Principal Component 1',x_bins=10)
sns.regplot(x=pca_vectors[:,1],y=df_y, label='Principal Component 2',x_bins=10)
sns.regplot(x=pca_vectors[:,2],y=df_y, label='Principal Component 3',x_bins=10)

plt.title('Most Important Principal Components vs Reference Value')
plt.xlabel('Principal Component Value')
plt.ylabel('Reference Value')
plt.legend()
plt.show()
In [28]:
pca = PCA(n_components=3)
X_train_pca = pca.fit_transform(X_train)
In [29]:
# Create a 3D plot
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

# Plot the target variable (reference) as color gradient
sc = ax.scatter(pca_vectors[:, 0], pca_vectors[:, 1], df_y, c=df_y, cmap='viridis')

# Add labels and title
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_zlabel('Reference')
ax.set_title('3D Plot: Reference vs. Principal Components 1 and 2')

# Add color bar
cbar = fig.colorbar(sc)
cbar.set_label('Reference')

# Show the plot
plt.show()

Linear Regression with PCA features¶

In [30]:
# spliting data to training and testing
train_x, test_x, train_y, test_y = train_test_split(pca_vectors, df_y, test_size=0.3, random_state=42)
In [31]:
# Initializing Linear Regression model
lm_pca = LinearRegression()
lm_pca.fit(train_x, train_y)
Out[31]:
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
In [32]:
# Evaluation on testing set

y_pred_linear_test_pca = lm_pca.predict(test_x)

mse = mean_squared_error(test_y, y_pred_linear_test_pca)
mae = mean_absolute_error(test_y, y_pred_linear_test_pca)
linear_r2 = r2_score(test_y, y_pred_linear_test_pca)

print('Test Set Mean Squared Error:', mse)
print('Test Set Mean Absolute Error:', mae)
print('Test Set R-squared:', linear_r2)
Test Set Mean Squared Error: 87.75937220286849
Test Set Mean Absolute Error: 6.881495867430069
Test Set R-squared: 0.8232425133026595
In [33]:
# Evaluation on training set

y_pred_linear_train_pca = lm_pca.predict(train_x)

mse = mean_squared_error(train_y, y_pred_linear_train_pca)
mae = mean_absolute_error(train_y, y_pred_linear_train_pca)
r2 = r2_score(y_train, y_pred_linear_train_pca)

print('Train Set Mean Squared Error:', mse)
print('Train Set Mean Absolute Error:', mae)
print('Train Set R-squared:', r2)
Train Set Mean Squared Error: 89.76536510098475
Train Set Mean Absolute Error: 6.93228434511436
Train Set R-squared: 0.820686825605212
In [34]:
# Predict the target variable on the test set
y_test_pred = lm_pca.predict(test_x)

# Plot the predictions against the actual reference values
plt.figure(figsize=(10, 6))
plt.scatter(test_y, y_test_pred, color='blue', alpha=0.5)
plt.plot([min(test_y), max(test_y)], [min(test_y), max(test_y)], color='red', linestyle='--')
plt.xlabel('Actual Reference Values')
plt.ylabel('Predicted Reference Values')
plt.title('Actual vs. Predicted Reference Values (Test Set)')
plt.show()

Ridge Regression¶

In [36]:
# Create a Ridge regression model
ridge = Ridge(alpha=0.3)

ridge.fit(train_x, train_y)
Out[36]:
Ridge(alpha=0.3)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Ridge(alpha=0.3)
In [37]:
# Evaluate the model on the testing set

y_pred_ridge_test = ridge.predict(test_x)

mse = mean_squared_error(test_y, y_pred_ridge_test)
mae = mean_absolute_error(test_y, y_pred_ridge_test)
ridge_r2 = r2_score(test_y, y_pred_ridge_test)

print('Test Set Mean Squared Error:', mse)
print('Test Set Mean Absolute Error:', mae)
print('Test Set R-squared:', ridge_r2)
Test Set Mean Squared Error: 87.75936425110947
Test Set Mean Absolute Error: 6.881494004018261
Test Set R-squared: 0.8232425293184183
In [38]:
# Predict the target variable on the training set
y_pred_ridge_train = ridge.predict(train_x)

# Calculate evaluation metrics on the training set
train_mse = mean_squared_error(train_y, y_pred_ridge_train)
train_mae = mean_absolute_error(train_y, y_pred_ridge_train)
train_r2 = r2_score(train_y, y_pred_ridge_train)

print('Train Set Mean Squared Error:', train_mse)
print('Train Set Mean Absolute Error:', train_mae)
print('Train Set R-squared:', train_r2)
Train Set Mean Squared Error: 89.76536510173793
Train Set Mean Absolute Error: 6.932282934455073
Train Set R-squared: 0.8206868256037074
In [39]:
# Predict the target variable on the test set
y_test_pred = ridge.predict(test_x)

# Plot the predictions against the actual reference values
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_test_pred, color='blue', alpha=0.5)
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linestyle='--')
plt.xlabel('Actual Reference Values')
plt.ylabel('Predicted Reference Values')
plt.title('Actual vs. Predicted Reference Values (Test Set)')
plt.show()

ElasticNetCV¶

In [40]:
# Fit an Elastic Net regression model
regr_en = ElasticNetCV(cv=5, alphas=[0.1, 0.3, 0.5, 0.7, 1.0])
regr_en.fit(train_x, train_y)
Out[40]:
ElasticNetCV(alphas=[0.1, 0.3, 0.5, 0.7, 1.0], cv=5)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
ElasticNetCV(alphas=[0.1, 0.3, 0.5, 0.7, 1.0], cv=5)
In [41]:
# Evaluate the model on the testing set
y_pred_elastic_test = regr_en.predict(test_x)

mse = mean_squared_error(test_y, y_pred_elastic_test)
mae = mean_absolute_error(test_y, y_pred_elastic_test)
elastic_r2 = r2_score(test_y, y_pred_elastic_test)

print('Test Set Mean Squared Error:', mse)
print('Test Set Mean Absolute Error:', mae)
print('Test Set R-squared:',elastic_r2)

# Iterate and fine-tune the model as needed by adjusting PCA components, alpha, l1_ratio, or other parameters
# Evaluate the model's performance and repeat the process until satisfactory results are achieved.
Test Set Mean Squared Error: 87.75598081697953
Test Set Mean Absolute Error: 6.857716549194305
Test Set R-squared: 0.8232493439445739
In [42]:
# Predict the target variable on the training set
y_pred_elastic_train = regr_en.predict(train_x)

# Calculate evaluation metrics on the training set
train_mse = mean_squared_error(train_y, y_pred_elastic_train)
train_mae = mean_absolute_error(train_y, y_pred_elastic_train)
train_r2 = r2_score(train_y, y_pred_elastic_train)

print('Train Set Mean Squared Error:', train_mse)
print('Train Set Mean Absolute Error:', train_mae)
print('Train Set R-squared:', train_r2)
Train Set Mean Squared Error: 89.9141725526254
Train Set Mean Absolute Error: 6.914168265980384
Train Set R-squared: 0.8203895713524468
In [43]:
# Predict the target variable on the test set
y_test_pred = regr_en.predict(test_x)

# Plot the predictions against the actual reference values
plt.figure(figsize=(10, 6))
plt.scatter(test_y, y_test_pred, color='blue', alpha=0.5)
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linestyle='--')
plt.xlabel('Actual Reference Values')
plt.ylabel('Predicted Reference Values')
plt.title('Actual vs. Predicted Reference Values (Test Set)')
plt.show()

DecisionTreeRegressor¶

In [44]:
# Split the data into training and testing sets (using the PCA vectors)
X_train, X_test, y_train, y_test = train_test_split(pca_vectors, df_y, test_size=0.3, random_state=42)

# Create the Decision Tree Regression model
dt_regressor = DecisionTreeRegressor(random_state=42)

# Fit the model on the training data
dt_regressor.fit(X_train, y_train)

# Predict the target variable on the testing set
y_pred_decision_test = dt_regressor.predict(X_test)

# Evaluate the model's performance on the testing set
mse = mean_squared_error(y_test, y_pred_decision_test)
mae = mean_absolute_error(y_test, y_pred_decision_test)
decision_r2 = r2_score(y_test, y_pred_decision_test)

print('Test Set Mean Squared Error:', mse)
print('Test Set Mean Absolute Error:', mae)
print('Test Set R-squared:', decision_r2)
Test Set Mean Squared Error: 12.940707700156338
Test Set Mean Absolute Error: 0.7184258982866043
Test Set R-squared: 0.97393592374525
In [45]:
# Predict the target variable on the training set
y_pred_decision_train = dt_regressor.predict(X_train)

# Calculate evaluation metrics on the training set
train_mse = mean_squared_error(y_train, y_pred_decision_train)
train_mae = mean_absolute_error(y_train, y_pred_decision_train)
train_r2 = r2_score(y_train, y_pred_decision_train)

print('Train Set Mean Squared Error:', train_mse)
print('Train Set Mean Absolute Error:', train_mae)
print('Train Set R-squared:', train_r2)
Train Set Mean Squared Error: 1.979743065754344e-06
Train Set Mean Absolute Error: 1.0282376502002683e-05
Train Set R-squared: 0.999999996045312
In [48]:
# Predict the target variable on the test set
y_test_pred = dt_regressor.predict(X_test)

# Plot the predictions against the actual reference values
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_test_pred, color='blue', alpha=0.5)
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linestyle='--')
plt.xlabel('Actual Reference Values')
plt.ylabel('Predicted Reference Values')
plt.title('DT Regressor: Actual vs. Predicted Reference Values')
plt.show()

Ensemble model 1¶

with pca vector models

PCA input data goes to linear,ridge, elastic and decision tree model.

In [49]:
meta_train_features = np.column_stack((y_pred_linear_train_pca, y_pred_ridge_train,
                                      y_pred_elastic_train, y_pred_decision_train))
In [51]:
# Define the parameter grid
param_dist = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5, 6],
    'n_estimators': [100, 200, 300],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'gamma': [0, 1, 5],
    'min_child_weight': [1, 5, 10],
    'reg_alpha': [0, 1e-5, 1e-2, 0.1, 1],
    'reg_lambda': [0, 1e-5, 1e-2, 0.1, 1]
}

# Initialize the XGBoost regressor
xgb_model = xgb.XGBRegressor()

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    xgb_model, param_distributions=param_dist,
    n_iter=10, scoring='neg_mean_squared_error',
    cv=5, verbose=1, random_state=42
)

# Perform the randomized search
random_search.fit(meta_train_features, train_y)

# Get the best hyperparameters and score
best_params = random_search.best_params_
best_score = random_search.best_score_

print("Best Hyperparameters:", best_params)
print("\nBest Score (Negative MSE):", best_score)
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Hyperparameters: {'subsample': 0.9, 'reg_lambda': 0, 'reg_alpha': 1, 'n_estimators': 300, 'min_child_weight': 1, 'max_depth': 4, 'learning_rate': 0.2, 'gamma': 0, 'colsample_bytree': 1.0}

Best Score (Negative MSE): -0.003914909703952625
In [52]:
for key, value in best_params.items():
    print(f"{key}: {value}")
subsample: 0.9
reg_lambda: 0
reg_alpha: 1
n_estimators: 300
min_child_weight: 1
max_depth: 4
learning_rate: 0.2
gamma: 0
colsample_bytree: 1.0
In [53]:
# Meta-Model Selection and Training

# Use the best hyperparameters obtained from hyperparameter tuning
best_subsample = 0.9
best_reg_lambda = 0
best_reg_alpha = 1
best_n_estimators = 300
best_min_child_weight = 1
best_max_depth = 4
best_learning_rate = 0.2
best_gamma = 0
best_colsample_bytree = 1.0

meta_model = xgb.XGBRegressor(
    subsample=best_subsample,
    reg_lambda=best_reg_lambda,
    reg_alpha=best_reg_alpha,
    n_estimators=best_n_estimators,
    min_child_weight=best_min_child_weight,
    max_depth=best_max_depth,
    learning_rate=best_learning_rate,
    gamma=best_gamma,
    colsample_bytree=best_colsample_bytree,
    random_state=42
)

meta_model.fit(meta_train_features, train_y)
Out[53]:
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=1.0, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=0, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.2, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=4, max_leaves=None,
             min_child_weight=1, missing=nan, monotone_constraints=None,
             n_estimators=300, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=42, ...)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=1.0, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=0, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.2, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=4, max_leaves=None,
             min_child_weight=1, missing=nan, monotone_constraints=None,
             n_estimators=300, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=42, ...)
In [54]:
# Evaluation on testing set

meta_test_features = np.column_stack((y_pred_linear_test_pca, y_pred_ridge_test,
                                      y_pred_elastic_test, y_pred_decision_test))

ensemble_test_predictions = meta_model.predict(meta_test_features)

ensemble_mse = mean_squared_error(test_y, ensemble_test_predictions)
print("Test Set Ensemble MSE:", ensemble_mse)

ensemble_r2 = r2_score(test_y, ensemble_test_predictions)
print("Test Set Ensemble R-squared:", ensemble_r2)
Test Set Ensemble MSE: 12.943920760380433
Test Set Ensemble R-squared: 0.9739294522717701
In [55]:
# Evaluation on training set

meta_train_features = np.column_stack((y_pred_linear_train, y_pred_ridge_train,
                                      y_pred_elastic_train, y_pred_decision_train))

ensemble_train_predictions = meta_model.predict(meta_train_features)

ensemble_mse = mean_squared_error(train_y, ensemble_train_predictions)
print("Train Set Ensemble MSE:", ensemble_mse)

ensemble_r2 = r2_score(train_y, ensemble_train_predictions)
print("Train Set Ensemble R-squared:", ensemble_r2)
Train Set Ensemble MSE: 0.0033983004727524296
Train Set Ensemble R-squared: 0.9999932116351933
In [56]:
# Plotting prediction vs actual values
plt.scatter(test_y, ensemble_test_predictions)
plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
plt.title("Actual vs. Predicted Values")
plt.show()
In [57]:
residuals = test_y - ensemble_test_predictions
plt.scatter(test_y, residuals)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel("Actual Values")
plt.ylabel("Residuals")
plt.title("Residual Plot")
plt.show()
In [58]:
# List of R-squared values for each model
r2_values = [ensemble_r2, linear_r2, ridge_r2, elastic_r2, decision_r2]

# List of model names
model_names = ['Ensemble', 'Linear', 'Ridge', 'Elastic Net', 'Decision Tree']

# Create a bar chart
plt.figure(figsize=(10, 6))
plt.bar(model_names, r2_values, color=['blue', 'green', 'orange', 'red', 'purple'])
plt.title('Comparison of R-squared Values for Different Models')
plt.xlabel('Model')
plt.ylabel('R-squared Value')
plt.ylim(0, 1)
plt.show()

LIME (Local Interpretable Model-agnostic Explanations)

In [59]:
# Create a LimeTabularExplainer instance
explainer = lime_tabular.LimeTabularExplainer(
    training_data=meta_test_features,        # The training dataset used for generating local explanations
    feature_names=["linearModel", "ridgeModel", "elasticModel", "decisionTree"], # Names of the features/columns in the dataset
    mode="regression"                # Specify mode as "regression" for regression problems
)

# Explain an instance using LIME
explain_lime = explainer.explain_instance(
    meta_test_features[20],      # The instance to be explained (in this case, index 20 of the dataset)
    meta_model.predict,          # The prediction function of the meta-model
    num_features=4               # Number of top features to show in the explanation
)

# Display the LIME explanation in a notebook
explain_lime.show_in_notebook()

k-fold Cross Validation

In [63]:
# Perform cross-validation and obtain predictions using cross_val_predict
ensemble_train_predictions = cross_val_predict(meta_model, meta_train_features, train_y, cv=5)

# Calculate cross-validation metrics
ensemble_mse_train_cv = mean_squared_error(train_y, ensemble_train_predictions)
ensemble_r2_train_cv = r2_score(train_y, ensemble_train_predictions)

print("Cross-Validation Train Set Ensemble MSE:", ensemble_mse_train_cv)
print("Cross-Validation Train Set Ensemble R-squared:", ensemble_r2_train_cv)
Cross-Validation Train Set Ensemble MSE: 0.00402036325103711
Cross-Validation Train Set Ensemble R-squared: 0.9999919690172713
In [64]:
# Data Preparation: Splitting dataset into training, validation, and test sets.
X_train, X_temp, y_train, y_temp = train_test_split(df_x, df_y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

Ensemble model 2¶

With new models

all models are using features that are not reduced using pca

In [65]:
# Train Base Models
linear_reg = LinearRegression()
ridge_reg = Ridge(alpha=1.0)
elastic_net = ElasticNetCV(cv=5)
decision_tree = DecisionTreeRegressor(max_depth=5)
In [66]:
linear_reg.fit(X_train, y_train)
ridge_reg.fit(X_train, y_train)
elastic_net.fit(X_train, y_train)
decision_tree.fit(X_train, y_train)
Out[66]:
DecisionTreeRegressor(max_depth=5)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeRegressor(max_depth=5)
In [67]:
# Collect Predictions on Validation Set
val_predictions_linear = linear_reg.predict(X_val)
val_predictions_ridge = ridge_reg.predict(X_val)
val_predictions_elastic = elastic_net.predict(X_val)
val_predictions_tree = decision_tree.predict(X_val)
In [68]:
# Using XGBoost as the meta-model
meta_features = np.column_stack((val_predictions_linear, val_predictions_ridge,
                                 val_predictions_elastic, val_predictions_tree))
In [69]:
# Define the parameter grid
param_dist = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5, 6],
    'n_estimators': [100, 200, 300],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

# Initialize the XGBoost regressor
xgb_model = xgb.XGBRegressor()

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    xgb_model, param_distributions=param_dist,
    n_iter=10, scoring='neg_mean_squared_error',
    cv=5, verbose=1, random_state=42
)

# Perform the randomized search
random_search.fit(meta_features, y_val)

# Get the best hyperparameters and score
best_params = random_search.best_params_
best_score = random_search.best_score_

print("Best Hyperparameters:", best_params)
print("Best Score (Negative MSE):", best_score)
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Hyperparameters: {'subsample': 1.0, 'n_estimators': 200, 'max_depth': 4, 'learning_rate': 0.1, 'colsample_bytree': 1.0}
Best Score (Negative MSE): -36.0905718275611
In [70]:
# Meta-Model Selection and Training

# Use the best hyperparameters obtained from hyperparameter tuning
best_subsample = 1.0
best_n_estimators = 200
best_max_depth = 4
best_learning_rate = 0.1
best_colsample_bytree = 1.0

meta_model = xgb.XGBRegressor(
    subsample=best_subsample,
    n_estimators=best_n_estimators,
    max_depth=best_max_depth,
    learning_rate=best_learning_rate,
    colsample_bytree=best_colsample_bytree,
    random_state=42
)

meta_model.fit(meta_features, y_val)
Out[70]:
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=1.0, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.1, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=4, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=200, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=42, ...)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=1.0, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.1, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=4, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=200, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=42, ...)
In [71]:
# Predict with the Ensemble on the Test Set
test_predictions_linear = linear_reg.predict(X_test)
test_predictions_ridge = ridge_reg.predict(X_test)
test_predictions_elastic = elastic_net.predict(X_test)
test_predictions_tree = decision_tree.predict(X_test)

meta_test_features = np.column_stack((test_predictions_linear, test_predictions_ridge,
                                      test_predictions_elastic, test_predictions_tree))

ensemble_predictions = meta_model.predict(meta_test_features)
In [72]:
# Evaluation on testing set

ensemble_mse = mean_squared_error(y_test, ensemble_predictions)
print("Test Set Ensemble MSE:", ensemble_mse)

ensemble_r2 = r2_score(y_test, ensemble_predictions)
print("Test Set Ensemble R-squared:", ensemble_r2)
Test Set Ensemble MSE: 38.6631255329587
Test Set Ensemble R-squared: 0.9219275233928775
In [73]:
plt.scatter(y_test, ensemble_predictions)
plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
plt.title("Actual vs. Predicted Values")
plt.show()
In [74]:
residuals = y_test - ensemble_predictions
plt.scatter(y_test, residuals)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel("Actual Values")
plt.ylabel("Residuals")
plt.title("Residual Plot")
plt.show()

LIME (Local Interpretable Model-agnostic Explanations)

In [80]:
explainer = lime_tabular.LimeTabularExplainer(training_data=meta_features, feature_names=["linearModel", "ridgeModel", "elasticModel", "decisionTree"], mode="regression")

explain_lime = explainer.explain_instance(meta_features[10], meta_model.predict, num_features=4)
explain_lime.show_in_notebook()